#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Jul 31 19:41:27 2022

@author: jerry
"""
import json
from tencentcloud.common import credential
from tencentcloud.common.profile.client_profile import ClientProfile
from tencentcloud.common.profile.http_profile import HttpProfile
from tencentcloud.common.exception.tencent_cloud_sdk_exception import TencentCloudSDKException
from tencentcloud.ocr.v20181119 import ocr_client, models
import time
import base64
import math

# The coordinates below were obtained by parsing c1.pdf; they are the positions
# of the data points on the aerospace-metrology certificate. The temperature
# data consists of 9 points, representing 9 measurement values.
# Each entry is an [X, Y] pair that is compared against a detection's
# ItemPolygon X/Y by point_is_in_range.
# 'sn' is the position page_one looks for (the factory serial number).
points_array={'sn':[603,875]}
# Expected positions of the values page_two collects, matched in this order.
element_array=[[663,1105],[663,1136],[661,1165],[661,1193],
              [661,1225],[660,1254],[659,1286],[655,1315],[655,1346]] 

def image_to_base64(file_path):
    """
    Read a file in binary mode and return its content as Base64 text.

    :param file_path: path of the file to encode (the caller passes a PDF)
    :return: the Base64 encoding of the file's bytes, decoded to ``str``
    """
    with open(file_path, "rb") as source:
        raw_bytes = source.read()
    return base64.b64encode(raw_bytes).decode('UTF-8')

def ocr_to_jason(filename, pdf_page_count=6, secret_id=None, secret_key=None):
    """
    OCR a certificate PDF page by page and append the extracted values to a CSV.

    Pages 1..``pdf_page_count`` are sent one at a time to Tencent Cloud's
    GeneralAccurateOCR API (endpoint ``ocr.tencentcloudapi.com``, region
    ``ap-shanghai``). Odd pages are handed to ``page_one`` and even pages to
    ``page_two``; after every even page one line made of the odd page's result
    followed by the even page's values, comma separated, is appended to
    ``<YYYY-MM-DD>.csv`` in the current working directory.

    :param filename: path of the PDF file to recognise
    :param pdf_page_count: number of pages to process. The page count is not
        discovered automatically, so the caller has to supply it; defaults to 6
    :param secret_id: Tencent Cloud SecretId; when omitted it is read from the
        ``TENCENTCLOUD_SECRET_ID`` environment variable
    :param secret_key: Tencent Cloud SecretKey; when omitted it is read from
        the ``TENCENTCLOUD_SECRET_KEY`` environment variable
    :return: the decoded JSON response of the last page processed (only the
        last page is returned, not one object per page), or ``None`` when no
        page was processed or the SDK raised ``TencentCloudSDKException``
    :raises ValueError: if no credentials were passed in or found in the
        environment
    """
    # Local import: ``os`` is not part of this file's import header.
    import os

    # Credentials must never be committed to source control; take them from
    # the caller or the environment instead. Keys can be created at
    # https://console.cloud.tencent.com/cam/capi
    secret_id = secret_id or os.environ.get("TENCENTCLOUD_SECRET_ID")
    secret_key = secret_key or os.environ.get("TENCENTCLOUD_SECRET_KEY")
    if not secret_id or not secret_key:
        raise ValueError(
            "Tencent Cloud credentials are missing: pass secret_id/secret_key "
            "or set TENCENTCLOUD_SECRET_ID and TENCENTCLOUD_SECRET_KEY"
        )

    last_page_json = None
    try:
        cred = credential.Credential(secret_id, secret_key)

        # Optional HTTP/client profiles; only the endpoint is customised.
        http_profile = HttpProfile()
        http_profile.endpoint = "ocr.tencentcloudapi.com"
        client_profile = ClientProfile()
        client_profile.httpProfile = http_profile
        client = ocr_client.OcrClient(cred, "ap-shanghai", client_profile)

        # The whole PDF is encoded once and re-sent with a different page
        # number for every request.
        pdf_base64 = image_to_base64(filename)
        # Kept in its own variable so the ``filename`` argument is not clobbered.
        csv_path = time.strftime("%Y-%m-%d", time.localtime()) + '.csv'

        # 'null' is the same "not found" marker page_one returns.
        serial_number = 'null'
        for page_number in range(1, pdf_page_count + 1):
            # One request object per call; IsPdf/PdfPageNumber select the page.
            req = models.GeneralAccurateOCRRequest()
            req.from_json_string(json.dumps({
                "ImageBase64": pdf_base64,
                "IsPdf": True,
                "PdfPageNumber": page_number,
            }))
            resp = client.GeneralAccurateOCR(req)
            last_page_json = json.loads(resp.to_json_string())

            if page_number % 2 != 0:
                # Odd page: remember its value for the row written after the
                # following even page.
                serial_number = page_one(last_page_json)
                print('当前页数：', page_number)
                continue

            # Even page: collect the values and emit one CSV row.
            values = page_two(last_page_json)
            print('当前页数：', page_number)
            row = ','.join([serial_number] + values)
            print(row)
            with open(csv_path, "a", encoding="utf-8") as csv_file:
                csv_file.write(row + '\n')

        return last_page_json
    except TencentCloudSDKException as err:
        print(err)
        return None
    
    # test begin
    # pdf_json=json.loads(json_str2)
    # print(pdf_json)
    # Parsing format for the aerospace-metrology certificate
def page_one(pdf_json):
    """
    Pick the factory serial number out of a first-page OCR result.

    Each entry of ``pdf_json['TextDetections']`` is tested in order: its
    ``ItemPolygon`` X/Y is compared with ``points_array['sn']`` through
    ``point_is_in_range``. The text of the first entry that matches is returned.

    :param pdf_json: decoded OCR response containing a 'TextDetections' list
    :return: the matching entry's 'DetectedText', or the string 'null' when
        no entry is close enough
    """
    # Earlier, index-based reads that are no longer used:
    #   testing organisation: pdf_json['TextDetections'][0]['DetectedText']
    #   certificate number:   pdf_json['TextDetections'][3]['DetectedText']
    candidates = (
        detection['DetectedText']
        for detection in pdf_json['TextDetections']
        if point_is_in_range(
            [detection['ItemPolygon']['X'], detection['ItemPolygon']['Y']],
            points_array['sn'],
        )
    )
    # The generator is lazy, so scanning stops at the first hit.
    return next(candidates, 'null')
    
    
def page_two(pdf_json):
    """
    Collect the measurement texts from a second-page OCR result.

    The entries of ``pdf_json['TextDetections']`` are walked in order. Each one
    is compared only with the next not-yet-matched position in
    ``element_array``; on a match its text is stored and the following
    position becomes the target. Scanning stops once every position matched.

    :param pdf_json: decoded OCR response containing a 'TextDetections' list
    :return: list of 'DetectedText' strings, in ``element_array`` order; it is
        shorter than ``element_array`` when some positions were never matched
    """
    collected = []
    expected_total = len(element_array)
    cursor = 0
    for detection in pdf_json['TextDetections']:
        polygon = detection['ItemPolygon']
        position = [polygon['X'], polygon['Y']]
        if point_is_in_range(position, element_array[cursor]):
            collected.append(detection['DetectedText'])
            cursor += 1
        # All expected positions found: nothing left to look for.
        if cursor >= expected_total:
            break
    return collected

# Decide whether a point lies inside a circle around a reference point; used to
# check whether a text coordinate is at an expected position. According to the
# original author's testing, the deviation between the two points stays below
# 10 units, hence the default tolerance.
def point_is_in_range(dat1, dat2, max_deviance=10):
    """
    Return True when ``dat1`` is within ``max_deviance`` of ``dat2``.

    :param dat1: point to test, indexable as ``[x, y]``
    :param dat2: reference point, indexable as ``[x, y]``
    :param max_deviance: largest accepted Euclidean distance (inclusive);
        defaults to 10
    :return: ``True`` if the distance is at most ``max_deviance``, else ``False``
    """
    dx = dat1[0] - dat2[0]
    dy = dat1[1] - dat2[1]
    return math.sqrt(dx ** 2 + dy ** 2) <= max_deviance
    
# Script entry point: run the OCR extraction on the sample certificate.
# NOTE(review): this call is not behind an `if __name__ == "__main__":` guard,
# so it also runs whenever this module is imported.
ocr_to_jason('/home/jerry/python/ocr/c1.pdf')   



